/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is StructureMerger.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk> (original author)
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*/
package org.terrier.structures.merging;
import gnu.trove.TIntIntHashMap;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import org.terrier.compression.BitIn;
import org.terrier.structures.BasicDocumentIndexEntry;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.DirectIndex;
import org.terrier.structures.DirectIndexInputStream;
import org.terrier.structures.DirectInvertedOutputStream;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldDirectInvertedOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.InvertedIndex;
import org.terrier.structures.InvertedIndexInputStream;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.MetaIndex;
import org.terrier.structures.PostingIndex;
import org.terrier.structures.PostingIndexInputStream;
import org.terrier.structures.SimpleBitIndexPointer;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.CompressingMetaIndexBuilder;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.LexiconBuilder;
import org.terrier.structures.indexing.MetaIndexBuilder;
import org.terrier.structures.postings.BasicIterablePosting;
import org.terrier.structures.postings.FieldIterablePosting;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.structures.postings.Posting;
import org.terrier.structures.postings.PostingIdComparator;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
/**
* This class merges the structures created by Terrier, so that
* we use fewer and larger inverted and direct files.
* <p>
* <b>Properties:</b><ul>
* <li><tt>lexicon.use.hash</tt> - build a lexicon hash file for new index. Set to <tt>true</tt> by default.</li>
* <li><tt>merge.direct</tt> - merge the direct indices if both indices have them. Set to <tt>true</tt> by default.</li>
* </ul>
* @author Vassilis Plachouras and Craig Macdonald
*/
public class StructureMerger {
/** the logger used */
protected static final Logger logger = Logger.getLogger(StructureMerger.class);
/**
* A hashmap for converting the codes of terms appearing only in the
* vocabulary of the second set of data structures into a new set of
* term codes for the merged set of data structures.
*/
protected TIntIntHashMap termcodeHashmap = null;
/** If true, termcodeHashmap is populated while merging the lexicons.
* Set by mergeStructures() when both source indices have direct structures,
* as the direct-file merge rewrites term ids through the map. */
protected boolean keepTermCodeMap = false;
/** The number of documents in the merged structures. */
protected int numberOfDocuments;
/** The number of pointers in the merged structures. */
protected long numberOfPointers;
/** The number of terms in the collection. */
protected int numberOfTerms;
/** If true (property <tt>merger.meta.reverse</tt>, default <tt>true</tt>),
* reverse meta lookup keys are also built for the merged meta index. */
protected boolean MetaReverse = Boolean.parseBoolean(ApplicationSetup.getProperty("merger.meta.reverse", "true"));
/** source index 1 */
protected Index srcIndex1;
/** source index 2 */
protected Index srcIndex2;
/** destination index */
protected Index destIndex;
/** class to use to write direct file */
protected Class<? extends DirectInvertedOutputStream> directFileOutputStreamClass = DirectInvertedOutputStream.class;
/** class to use to write the direct file when fields are in use */
protected Class<? extends DirectInvertedOutputStream> fieldDirectFileOutputStreamClass = FieldDirectInvertedOutputStream.class;
/** class to use to write inverted file */
protected Class<? extends DirectInvertedOutputStream> invertedFileOutputStreamClass = DirectInvertedOutputStream.class;
/** class to use to write inverted file */
protected Class<? extends DirectInvertedOutputStream> fieldInvertedFileOutputStreamClass = FieldDirectInvertedOutputStream.class;
/** class to use to read the direct file */
protected String directFileInputClass = DirectIndex.class.getName();
/** class to use to read the direct file as a stream */
protected String directFileInputStreamClass = DirectIndexInputStream.class.getName();
/** class to use to read the inverted file */
protected String invertedFileInputClass = InvertedIndex.class.getName();
/** class to use to read the inverted file as a stream */
protected String invertedFileInputStreamClass = InvertedIndexInputStream.class.getName();
/** IterablePosting class recorded for the merged inverted structure (no fields) */
protected String basicInvertedIndexPostingIteratorClass = BasicIterablePosting.class.getName();
/** IterablePosting class recorded for the merged inverted structure (with fields) */
protected String fieldInvertedIndexPostingIteratorClass = FieldIterablePosting.class.getName();
/** IterablePosting class recorded for the merged direct structure (no fields) */
protected String basicDirectIndexPostingIteratorClass = BasicIterablePosting.class.getName();
/** IterablePosting class recorded for the merged direct structure (with fields) */
protected String fieldDirectIndexPostingIteratorClass = FieldIterablePosting.class.getName();
/**
 * Creates a StructureMerger that will merge the structures of the two
 * given source indices into the destination index.
 * @param _srcIndex1 first source index
 * @param _srcIndex2 second source index
 * @param _destIndex index to receive the merged structures
 */
public StructureMerger(Index _srcIndex1, Index _srcIndex2, Index _destIndex)
{
    srcIndex1 = _srcIndex1;
    srcIndex2 = _srcIndex2;
    destIndex = _destIndex;
    //statistics of the merged index start at zero
    this.numberOfDocuments = 0;
    this.numberOfPointers = 0L;
    this.numberOfTerms = 0;
}
/**
 * Sets the index that the merged structures will be written to.
 * This index should have no documents.
 * @param _outputIndex the index to be merged to
 */
public void setOutputIndex(Index _outputIndex) {
    destIndex = _outputIndex;
}
/**
 * Merges the two lexicons into one, writing the postings of each term to a
 * new inverted file as the merge proceeds, so each merged lexicon entry
 * carries a pointer into the new inverted file. Docids of postings from the
 * second index are offset by the number of documents in the first index.
 * Terms occurring only in the second index receive new term ids; when
 * keepTermCodeMap is set, the old-to-new term id mapping is recorded in
 * termcodeHashmap for the subsequent direct-file merge.
 */
@SuppressWarnings("unchecked")
protected void mergeInvertedFiles() {
    try {
        //getting the number of entries in the first document index,
        //in order to assign the correct docids to the documents
        //of the second inverted file.
        int numberOfDocs1 = srcIndex1.getCollectionStatistics().getNumberOfDocuments();
        int numberOfDocs2 = srcIndex2.getCollectionStatistics().getNumberOfDocuments();
        numberOfDocuments = numberOfDocs1 + numberOfDocs2;
        final int srcFieldCount1 = srcIndex1.getIntIndexProperty("index.inverted.fields.count", 0);
        //BUGFIX: this previously read srcIndex1's field count twice, so a
        //field-count mismatch between the two indices was never detected
        final int srcFieldCount2 = srcIndex2.getIntIndexProperty("index.inverted.fields.count", 0);
        if (srcFieldCount1 != srcFieldCount2)
        {
            throw new Error("FieldCounts in source indices must match");
        }
        final int fieldCount = srcFieldCount1;
        //creating a new map between new and old term codes
        if (keepTermCodeMap)
            termcodeHashmap = new TIntIntHashMap();
        //setting the input streams
        Iterator<Map.Entry<String,LexiconEntry>> lexInStream1 =
            (Iterator<Map.Entry<String,LexiconEntry>>)srcIndex1.getIndexStructureInputStream("lexicon");
        Iterator<Map.Entry<String,LexiconEntry>> lexInStream2 =
            (Iterator<Map.Entry<String,LexiconEntry>>)srcIndex2.getIndexStructureInputStream("lexicon");
        //copy the lexicon configuration of the first source index to the destination
        for(String property : new String[] {"index.inverted.fields.names", "max.term.length", "index.lexicon-keyfactory.class", "index.lexicon-keyfactory.parameter_values",
                "index.lexicon-keyfactory.parameter_types", "index.lexicon-valuefactory.class", "index.lexicon-valuefactory.parameter_values",
                "index.lexicon-valuefactory.parameter_types"} )
        {
            destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null));
        }
        FixedSizeWriteableFactory<LexiconEntry> lvf =
            (FixedSizeWriteableFactory<LexiconEntry>)srcIndex1.getIndexStructure("lexicon-valuefactory");
        //setting the output stream
        LexiconOutputStream<String> lexOutStream =
            new FSOMapFileLexiconOutputStream(destIndex, "lexicon", (Class <FixedSizeWriteableFactory<LexiconEntry>>) lvf.getClass());
        //term ids for terms new to the merged index start after srcIndex1's terms
        int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms();
        PostingIndex inverted1 = srcIndex1.getInvertedIndex();
        PostingIndex inverted2 = srcIndex2.getInvertedIndex();
        DirectInvertedOutputStream invOS = null;
        try{
            //reflective construction so subclasses can swap the output stream class
            invOS = (fieldCount > 0 ? fieldInvertedFileOutputStreamClass : invertedFileOutputStreamClass)
                .getConstructor(String.class)
                .newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR +
                    destIndex.getPrefix() + ".inverted"+ BitIn.USUAL_EXTENSION);
        } catch (Exception e) {
            logger.error("Couldn't create specified DirectInvertedOutputStream", e);
            return;
        }
        boolean hasMore1 = false;
        boolean hasMore2 = false;
        String term1;
        String term2;
        Map.Entry<String,LexiconEntry> lee1 = null;
        Map.Entry<String,LexiconEntry> lee2 = null;
        hasMore1 = lexInStream1.hasNext();
        if (hasMore1)
            lee1 = lexInStream1.next();
        hasMore2 = lexInStream2.hasNext();
        if (hasMore2)
            lee2 = lexInStream2.next();
        //classic two-way merge over the lexicographically sorted lexicons
        while (hasMore1 && hasMore2) {
            term1 = lee1.getKey();
            term2 = lee2.getKey();
            int lexicographicalCompare = term1.compareTo(term2);
            if (lexicographicalCompare < 0) {
                //write to inverted file postings for the term that only occurs in 1st index
                BitIndexPointer newPointer = invOS.writePostings(inverted1.getPostings(lee1.getValue()));
                lee1.getValue().setPointer(newPointer);
                numberOfPointers += newPointer.getNumberOfEntries();
                lexOutStream.writeNextEntry(term1, lee1.getValue());
                hasMore1 = lexInStream1.hasNext();
                if (hasMore1)
                    lee1 = lexInStream1.next();
            } else if (lexicographicalCompare > 0) {
                //write to inverted file postings for the term that only occurs in 2nd index
                //docids are transformed as we go.
                BitIndexPointer newPointer =
                    invOS.writePostings(inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
                lee2.getValue().setPointer(newPointer);
                numberOfPointers += newPointer.getNumberOfEntries();
                //assign a fresh term id, remembering the old one if a direct merge follows
                int newCode = newCodes++;
                if (keepTermCodeMap)
                    termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
                lee2.getValue().setTermId(newCode);
                lexOutStream.writeNextEntry(term2, lee2.getValue());
                hasMore2 = lexInStream2.hasNext();
                if (hasMore2)
                    lee2 = lexInStream2.next();
            } else {
                //write to postings for a term that occurs in both indices
                //1. postings from the first index are unchanged
                IterablePosting ip1 = inverted1.getPostings(lee1.getValue());
                BitIndexPointer newPointer1 = invOS.writePostings(ip1);
                //2. postings from the 2nd index have their docids transformed;
                //NOTE(review): the second argument continues delta-encoding from the
                //last docid written from ip1 — relies on writePostings() semantics
                IterablePosting ip2 = inverted2.getPostings(lee2.getValue());
                BitIndexPointer newPointer2 = invOS.writePostings(ip2, ip1.getId() - numberOfDocs1);
                numberOfPointers += newPointer1.getNumberOfEntries() + newPointer2.getNumberOfEntries();
                //don't set numberOfEntries, as LexiconEntry.add() will take care of this.
                lee1.getValue().setPointer(newPointer1);
                if (keepTermCodeMap)
                    termcodeHashmap.put(lee2.getValue().getTermId(), lee1.getValue().getTermId());
                lee1.getValue().add(lee2.getValue());
                lexOutStream.writeNextEntry(term1, lee1.getValue());
                hasMore1 = lexInStream1.hasNext();
                if (hasMore1)
                    lee1 = lexInStream1.next();
                hasMore2 = lexInStream2.hasNext();
                if (hasMore2)
                    lee2 = lexInStream2.next();
            }
        }
        //drain whichever lexicon still has entries
        if (hasMore1) {
            lee2 = null;
            while (hasMore1) {
                //write to inverted file as well.
                BitIndexPointer newPointer = invOS.writePostings(
                    inverted1.getPostings(lee1.getValue()));
                lee1.getValue().setPointer(newPointer);
                numberOfPointers += newPointer.getNumberOfEntries();
                lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
                hasMore1 = lexInStream1.hasNext();
                if (hasMore1)
                    lee1 = lexInStream1.next();
            }
        } else if (hasMore2) {
            lee1 = null;
            while (hasMore2) {
                //write to inverted file as well, transforming docids and term ids.
                BitIndexPointer newPointer = invOS.writePostings(
                    inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
                lee2.getValue().setPointer(newPointer);
                numberOfPointers += newPointer.getNumberOfEntries();
                int newCode = newCodes++;
                if (keepTermCodeMap)
                    termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
                lee2.getValue().setTermId(newCode);
                lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
                hasMore2 = lexInStream2.hasNext();
                if (hasMore2)
                    lee2 = lexInStream2.next();
            }
        }
        IndexUtil.close(lexInStream1);
        IndexUtil.close(lexInStream2);
        inverted1.close();
        inverted2.close();
        invOS.close();
        //record the merged structures and their statistics in the destination index
        destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments);
        destIndex.addIndexStructure(
                "inverted",
                invertedFileInputClass,
                "org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class",
                "index,structureName,document,"+
                    (fieldCount > 0
                        ? fieldInvertedIndexPostingIteratorClass
                        : basicInvertedIndexPostingIteratorClass ));
        destIndex.addIndexStructureInputStream(
                "inverted",
                invertedFileInputStreamClass,
                "org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
                "index,structureName,lexicon-entry-inputstream,"+
                    (fieldCount > 0
                        ? fieldInvertedIndexPostingIteratorClass
                        : basicInvertedIndexPostingIteratorClass ));
        destIndex.setIndexProperty("index.inverted.fields.count", ""+fieldCount);
        lexOutStream.close();
        if (fieldCount > 0)
        {
            destIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
        }
        destIndex.flush();
    } catch(IOException ioe) {
        logger.error("IOException while merging lexicons and inverted files.", ioe);
    }
}
/**
 * Merges the two direct files, together with the corresponding document
 * index and meta index structures. Postings from the second index have
 * their term ids rewritten through termcodeHashmap, so this must run after
 * mergeInvertedFiles() has populated that map.
 */
@SuppressWarnings("unchecked")
protected void mergeDirectFiles() {
    try {
        final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(destIndex, "document");
        //meta index configuration is taken from the first source index
        final String[] metaTags = ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.key-names", "docno"));
        final int[] metaTagLengths = ArrayUtils.parseCommaDelimitedInts(srcIndex1.getIndexProperty("index.meta.value-lengths", "20"));
        final String[] metaReverseTags = MetaReverse
            ? ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.reverse-key-names", "docno"))
            : new String[0];
        final MetaIndexBuilder metaBuilder = new CompressingMetaIndexBuilder(destIndex, metaTags, metaTagLengths, metaReverseTags);
        if (! srcIndex1.getIndexProperty("index.meta.key-names", "docno").equals(srcIndex2.getIndexProperty("index.meta.key-names", "docno")))
        {
            throw new Error("Meta fields in source indices must match");
        }
        //pointer written for documents that have no postings
        final BitIndexPointer emptyPointer = new SimpleBitIndexPointer();
        final int srcFieldCount1 = srcIndex1.getIntIndexProperty("index.direct.fields.count", 0);
        //BUGFIX: this previously read srcIndex1's field count twice, so a
        //field-count mismatch between the two indices was never detected
        final int srcFieldCount2 = srcIndex2.getIntIndexProperty("index.direct.fields.count", 0);
        if (srcFieldCount1 != srcFieldCount2)
        {
            throw new Error("FieldCounts in source indices must match");
        }
        final int fieldCount = srcFieldCount1;
        for(String property : new String[] {"index.direct.fields.names","index.direct.fields.count" } )
        {
            destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null));
        }
        DirectInvertedOutputStream dfOutput = null;
        try{
            //reflective construction so subclasses can swap the output stream class
            dfOutput =
                (fieldCount > 0 ? fieldDirectFileOutputStreamClass : directFileOutputStreamClass)
                .getConstructor(String.class)
                .newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR +
                    destIndex.getPrefix() + ".direct" + BitIn.USUAL_EXTENSION);
        } catch (Exception e) {
            logger.error("Couldn't create specified DirectInvertedOutputStream", e);
            return;
        }
        final Iterator<DocumentIndexEntry> docidInput1 = (Iterator<DocumentIndexEntry>)srcIndex1.getIndexStructureInputStream("document");
        final PostingIndexInputStream dfInput1 = (PostingIndexInputStream)srcIndex1.getIndexStructureInputStream("direct");
        final MetaIndex metaInput1 = srcIndex1.getMetaIndex();
        int sourceDocid = 0;
        //traversing the direct index of the first index, without any change
        while(docidInput1.hasNext())
        {
            BitIndexPointer pointerDF = emptyPointer;
            DocumentIndexEntry die = docidInput1.next();
            if (die.getDocumentLength() > 0)
            {
                //only non-empty documents have a direct posting list to copy
                pointerDF = dfOutput.writePostings(dfInput1.next());
            }
            die.setBitIndexPointer(pointerDF);
            docidOutput.addEntryToBuffer(die);
            metaBuilder.writeDocumentEntry(metaInput1.getAllItems(sourceDocid));
            sourceDocid++;
        }
        dfInput1.close();
        metaInput1.close();
        IndexUtil.close(docidInput1);
        final Iterator<DocumentIndexEntry> docidInput2 = (Iterator<DocumentIndexEntry>)srcIndex2.getIndexStructureInputStream("document");
        final PostingIndexInputStream dfInput2 = (PostingIndexInputStream)srcIndex2.getIndexStructureInputStream("direct");
        final MetaIndex metaInput2 = srcIndex2.getMetaIndex();
        sourceDocid = 0;
        //traversing the second index: term ids in each posting list are
        //remapped to the merged term ids, then re-sorted by id before writing
        while (docidInput2.hasNext())
        {
            DocumentIndexEntry die = docidInput2.next();
            BitIndexPointer pointerDF = emptyPointer;
            if (die.getDocumentLength() > 0)
            {
                final IterablePosting postings = dfInput2.next();
                List<Posting> postingList = new ArrayList<Posting>();
                while(postings.next() != IterablePosting.EOL)
                {
                    final Posting p = postings.asWritablePosting();
                    //map the old term id to the merged term id
                    p.setId(termcodeHashmap.get(postings.getId()));
                    postingList.add(p);
                }
                //remapping may break the id order, so sort before writing
                Collections.sort(postingList, new PostingIdComparator());
                pointerDF = dfOutput.writePostings(postingList.iterator());
            }
            die.setBitIndexPointer(pointerDF);
            docidOutput.addEntryToBuffer(die);
            metaBuilder.writeDocumentEntry(metaInput2.getAllItems(sourceDocid));
            sourceDocid++;
        }
        dfInput2.close();
        IndexUtil.close(docidInput2);
        metaInput2.close();
        metaBuilder.close();
        dfOutput.close();
        docidOutput.finishedCollections();
        docidOutput.close();
        //record the merged direct structures in the destination index
        destIndex.addIndexStructure(
                "direct",
                "org.terrier.structures.DirectIndex",
                "org.terrier.structures.Index,java.lang.String,java.lang.Class",
                "index,structureName,"+
                    (fieldCount > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass));
        destIndex.addIndexStructureInputStream(
                "direct",
                "org.terrier.structures.DirectIndexInputStream",
                "org.terrier.structures.Index,java.lang.String,java.lang.Class",
                "index,structureName,"+
                    (fieldCount > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass));
        if (fieldCount > 0)
        {
            destIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.direct.fields.count}");
        }
        else
        {
            destIndex.addIndexStructure("document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
        }
        destIndex.flush();
    } catch(IOException ioe) {
        logger.error("IOException while merging df and docid files.", ioe);
    }
}
/**
 * Collects every interface implemented by the class of the given object,
 * walking up the superclass chain and stopping before Object.
 * @param o the object whose class hierarchy is inspected
 * @return the interfaces implemented anywhere in o's class hierarchy
 */
protected static Class<?>[] getInterfaces(Object o)
{
    final List<Class<?>> found = new ArrayList<Class<?>>();
    for (Class<?> current = o.getClass(); ! current.equals(Object.class); current = current.getSuperclass())
    {
        Collections.addAll(found, current.getInterfaces());
    }
    return found.toArray(new Class[0]);
}
/**
 * Merges the two document index files and the meta index files, without
 * touching direct structures. Used when the direct indices are not being
 * merged (no direct structures, or merging disabled).
 */
@SuppressWarnings("unchecked")
protected void mergeDocumentIndexFiles() {
    try {
        //the output docid file
        final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(destIndex, "document");
        //meta index configuration is taken from the first source index
        final String[] metaTags = ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.key-names", "docno"));
        final int[] metaTagLengths = ArrayUtils.parseCommaDelimitedInts(srcIndex1.getIndexProperty("index.meta.value-lengths", "20"));
        final String[] metaReverseTags = MetaReverse
            ? ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.reverse-key-names", "docno"))
            : new String[0];
        final MetaIndexBuilder metaBuilder = new CompressingMetaIndexBuilder(destIndex, metaTags, metaTagLengths, metaReverseTags);
        if (! srcIndex1.getIndexProperty("index.meta.key-names", "docno").equals(srcIndex2.getIndexProperty("index.meta.key-names", "docno")))
        {
            throw new Error("Meta fields in source indices must match");
        }
        //opening the first set of files.
        final Iterator<DocumentIndexEntry> docidInput1 = (Iterator<DocumentIndexEntry>)srcIndex1.getIndexStructureInputStream("document");
        final Iterator<String[]> metaInput1 = (Iterator<String[]>)srcIndex1.getIndexStructureInputStream("meta");
        int srcFieldCount1 = srcIndex1.getIntIndexProperty("index.inverted.fields.count", 0);
        int srcFieldCount2 = srcIndex2.getIntIndexProperty("index.inverted.fields.count", 0);
        if (srcFieldCount1 != srcFieldCount2)
        {
            throw new Error("FieldCounts in source indices must match");
        }
        if (srcIndex1.getIndexProperty("index.document-factory.class", "").equals("org.terrier.structures.SimpleDocumentIndexEntry$Factory")
            || srcIndex1.getIndexProperty("index.document-factory.class", "").equals("org.terrier.structures.BasicDocumentIndexEntry$Factory"))
        {
            //for some reason, the source document index has no fields,
            //so we shouldn't assume that fields are being used.
            srcFieldCount1 = 0;
        }
        final int fieldCount = srcFieldCount1;
        //traversing the first set of files, without any change
        while(docidInput1.hasNext())
        {
            //result deliberately unused: kept in case hasNext() primes the
            //underlying stream implementation — TODO confirm it is droppable
            metaInput1.hasNext();
            DocumentIndexEntry die = docidInput1.next();
            DocumentIndexEntry dieNew = (fieldCount > 0) ? die : new SimpleDocumentIndexEntry(die);
            docidOutput.addEntryToBuffer(dieNew);
            metaBuilder.writeDocumentEntry(metaInput1.next());
        }
        final Iterator<DocumentIndexEntry> docidInput2 = (Iterator<DocumentIndexEntry>)srcIndex2.getIndexStructureInputStream("document");
        final Iterator<String[]> metaInput2 = (Iterator<String[]>)srcIndex2.getIndexStructureInputStream("meta");
        //traversing the 2nd set of files, without any change
        while(docidInput2.hasNext())
        {
            metaInput2.hasNext();
            DocumentIndexEntry die = docidInput2.next();
            DocumentIndexEntry dieNew = (fieldCount > 0) ? die : new SimpleDocumentIndexEntry(die);
            docidOutput.addEntryToBuffer(dieNew);
            metaBuilder.writeDocumentEntry(metaInput2.next());
        }
        docidOutput.finishedCollections();
        docidOutput.close();
        metaBuilder.close();
        IndexUtil.close(docidInput1);
        IndexUtil.close(docidInput2);
        //BUGFIX: the meta input streams were never closed (resource leak);
        //close them the same way as the document input streams
        IndexUtil.close(metaInput1);
        IndexUtil.close(metaInput2);
        if (fieldCount > 0)
        {
            destIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
        }
        else
        {
            destIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
        }
        destIndex.flush();
    } catch(IOException ioe) {
        logger.error("IOException while merging docid files.", ioe);
    }
}
/**
 * creates the final term code to offset file, and the lexicon hash if enabled.
 */
protected void createLexidFile() {
//delegates to LexiconBuilder, which builds the term-id lookup structure for
//the merged lexicon (and the lexicon hash when lexicon.use.hash is enabled)
LexiconBuilder.optimise(destIndex, "lexicon");
}
/**
 * Merges the structures created by Terrier: first the lexicons (with the
 * inverted files when both indices have them), then either the direct
 * structures or just the document/meta indices.
 */
public void mergeStructures() {
    final boolean haveBothInverted = srcIndex1.hasIndexStructure("inverted") && srcIndex2.hasIndexStructure("inverted");
    final boolean haveBothDirect = srcIndex1.hasIndexStructure("direct") && srcIndex2.hasIndexStructure("direct");
    final boolean haveBothLexicon = srcIndex1.hasIndexStructure("lexicon") && srcIndex2.hasIndexStructure("lexicon");
    //the term-id map is only needed if the direct files will be merged
    keepTermCodeMap = haveBothDirect;
    //phase 1: lexicons (and inverted files, when present in both)
    if (haveBothInverted) {
        mergeInvertedFiles();
    } else if (haveBothLexicon) {
        new LexiconMerger(srcIndex1, srcIndex2, destIndex).mergeLexicons();
    }
    final long lexiconsDone = System.currentTimeMillis();
    //phase 2: optimise the merged lexicon
    if (haveBothInverted || haveBothLexicon) {
        createLexidFile();
        final long optimiseDone = System.currentTimeMillis();
        logger.debug("created lexid file and lex hash in " + ((optimiseDone - lexiconsDone)/1000.0d));
    }
    //phase 3: direct structures, or just the document & meta indices
    if (! haveBothDirect || ApplicationSetup.getProperty("merge.direct","true").equals("false")) {
        mergeDocumentIndexFiles();
    } else {
        mergeDirectFiles();
    }
    if (keepTermCodeMap) {
        //save up some memory
        termcodeHashmap.clear();
        termcodeHashmap = null;
    }
}
/** Usage: java org.terrier.structures.merging.StructureMerger srcPath1 srcPrefix1 srcPath2 srcPrefix2 destPath destPrefix <p>
 * Merges the index at (srcPath1, srcPrefix1) with the index at (srcPath2, srcPrefix2)
 * into a new index created at (destPath, destPrefix). */
public static void main(String[] args) throws Exception {
    if (args.length != 6)
    {
        logger.fatal("usage: java org.terrier.structures.merging.StructureMerger srcPath1 srcPrefix1 srcPath2 srcPrefix2 destPath1 destPrefix1 ");
        logger.fatal("Exiting ...");
        return;
    }
    //indices are being modified, not queried
    Index.setIndexLoadingProfileAsRetrieval(false);
    final Index firstSource = Index.createIndex(args[0], args[1]);
    final Index secondSource = Index.createIndex(args[2], args[3]);
    final Index destination = Index.createNewIndex(args[4], args[5]);
    final StructureMerger merger = new StructureMerger(firstSource, secondSource, destination);
    if (ApplicationSetup.getProperty("merger.onlylexicons","false").equals("true")) {
        System.err.println("Use LexiconMerger");
        return;
    }
    if (ApplicationSetup.getProperty("merger.onlydocids","false").equals("true")) {
        merger.mergeDocumentIndexFiles();
    } else {
        merger.mergeStructures();
    }
    firstSource.close();
    secondSource.close();
    destination.close();
}
}